library(rvest)
## Warning: package 'rvest' was built under R version 4.0.5
library(xml2)
## Warning: package 'xml2' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.0.5
library(stringr)
library(purrr)
library(curl)
## Warning: package 'curl' was built under R version 4.0.5
## Using libcurl 7.64.1 with Schannel
library(glue)
## Warning: package 'glue' was built under R version 4.0.5
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(knitr)
## Warning: package 'knitr' was built under R version 4.0.5
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

R Markdown

#getting required urls
base_url <- "https://www.racingaustralia.horse/FreeFields/Calendar_Results.aspx"

state_list <- c("NSW",
                "QLD",
                "ACT",
                "VIC",
                "TAS",
                "SA",
                "WA",
                "NT")

#creating blank list so i can get a url for race results for each state
state_list_url <- c()

#generating urls for each states racing results over the last month
for (i in 1:length(state_list)) {
  state_list_url[[i]] <- paste0(base_url,"?State=", state_list[i])
}

#generating a list of urls to run through to get all race data for last month
results <- function(url_list){
  paste0("https://www.racingaustralia.horse",
         read_html(url_list) %>%
           html_nodes("a") %>%       
           # find all links in the page
           html_attr("href") %>%
           unique() %>%
           str_subset("/FreeFields/Results")
  )
}

#getting list of all urls with race data and converting to one list
race_results_urls <- map(state_list_url, results) %>%
  flatten() 

#separating out trial and race data
trial_results_urls <- race_results_urls[grepl("Trial", race_results_urls)]

racing_results_urls <- race_results_urls[!grepl("Trial", race_results_urls)]

#creating function to extract all tables from each url 
read_html_tbls <- function(url) {
  url = url(url, 'rb')
  html <- read_html(url) %>%
    html_table()
  
  close(url)
  return(html)
}

#creating blank list for race tables to be appended to
race_tables <- c()

#appending all race tables
race_tables <- map(racing_results_urls,read_html_tbls)

#creating blank list for trial tables to be appended to
trial_tables <- c()

#appending all race tables
trial_tables <- map(trial_results_urls,read_html_tbls)

#naming each group of tables with their date and location
names(trial_tables) <- sub('.*Key=','',trial_results_urls)


#naming each group of tables with their date and location
names(race_tables) <- sub('.*Key=','',racing_results_urls)

#removing abandoned races with no data
race_tables_non_abndn <- Filter(function(x) length(x) > 1,race_tables)

#removing abandoned races with no data
trial_tables_non_abndn <- Filter(function(x) length(x) > 1,trial_tables)

#filtering out tables that have non race results or race information
#appending race info table as a character string to the relevant race so it can be used
for (n in 1:length(race_tables_non_abndn)){
  if (ncol(race_tables_non_abndn[[n]][[1]])==4){
    race_tables_non_abndn[[n]] <- race_tables_non_abndn[[n]][-1]
  } else {
    NULL
  }}

  for (i in 1:length(race_tables_non_abndn[[n]])) {
    if ((i %% 2)!=0){
      NA
      } else {
        race_tables_non_abndn[[n]][[i]] <-race_tables_non_abndn[[n]][[i]] %>% 
          dplyr::mutate(character= as.character(race_tables_non_abndn[[n]][[i-1]]),
                 race_info = as.character(names(race_tables_non_abndn[[n]][[i-1]])),
                 info = glue("{names(race_tables_non_abndn)[[n]]}"))
      }
  }



#filtering out tables that have non trial results or race information
#appending race info table as a character string to the relevant race so it can be used
for (n in 1:length(trial_tables_non_abndn)){
  if (ncol(trial_tables_non_abndn[[n]][[1]])==4){
    trial_tables_non_abndn[[n]] <- trial_tables_non_abndn[[n]][-1]
  } else {
    NULL
  }
  for (i in 1:length(trial_tables_non_abndn[[n]])) {
    if (ncol(trial_tables_non_abndn[[n]][[i]])<2){
      NA
    } else {
      trial_tables_non_abndn[[n]][[i]] <- trial_tables_non_abndn[[n]][[i]] %>%
        mutate(character = as.character(trial_tables_non_abndn[[n]][[i-1]]),
               race_info = as.character(names(trial_tables_non_abndn[[n]][[i-1]])),
               info = glue("{names(trial_tables_non_abndn)[[n]]}"))
    }
  }
}

#flattening out both race and trial results
race_results_list <- flatten(race_tables_non_abndn)

trial_results_list <- flatten(trial_tables_non_abndn)

#keeping only tables that have anough columns to contain actual race data, not simply information regarding each race
filtered_race_results_list <- Filter(function(x) ncol(x) == 14,race_results_list)

filtered_trial_results_list <- Filter(function(x) ncol(x) == 14,trial_results_list)

#converting each list into a single df
race_results_combined <- do.call("rbind", filtered_race_results_list) 

trial_results_combined <- do.call("rbind", filtered_trial_results_list) 

#cleaning some data and extracting some information into columns for both race and trial data
race_results_combined <- race_results_combined %>%
  select(-Colour,
         -Penalty) %>%
  mutate(Finish = as.numeric(Finish),
         odds = as.numeric(gsub(".*?([0-9.]+).*", "\\1",`Starting Price`)),
         Weight = as.numeric(gsub("kg.*","", Weight)),
         Date_format = as.Date(str_split(info, ",") %>% map_chr(., 1),
                               format('%Y%b%d')),
         State = as.factor(str_split(info, ",") %>% map_chr(., 2)),
         Racecourse = as.factor(str_split(info, ",") %>% map_chr(., 3)),
         Race = as.factor(str_extract(race_info, "^Race \\d+")),
         Day_of_week = as.factor(format(Date_format, '%a')),
         distance = str_extract_all(race_info,"\\(\\d\\d\\d\\d METRES\\)"),
         track_condition = as.factor(as.character(str_extract_all(race_results_combined$character,"Track Condition: .* \\d+ T"))),
         distance = as.numeric(str_replace_all(distance, c("\\(" = "", "METRES\\)" = ""))),
         winning_time = gsub("Time: ","",str_extract_all(character,"Time: \\d+:\\d+.\\d+ "))
         ) %>%
  na.omit() 

#saveRDS(race_results_combined, glue("C:/Users/owenl/Documents/Owen/R_git/Horse_racing_analysis/Race_results_last_month_{Sys.Date()}.rds"))



trial_results_combined <- trial_results_combined %>%
  select(-Colour,
         -Penalty) %>%
  mutate(Finish = as.numeric(Finish),
         odds = as.numeric(gsub(".*?([0-9.]+).*", "\\1",`Starting Price`)),
         Weight = as.numeric(gsub("kg.*","", Weight)),
         Date_format = as.Date(str_split(info, ",") %>% map_chr(., 1),
                               format('%Y%b%d')),
         State = as.factor(str_split(info, ",") %>% map_chr(., 2)),
         Racecourse = as.factor(str_split(info, ",") %>% map_chr(., 3)),
         Race = as.factor(str_extract(race_info, "^Race \\d+")),
         Day_of_week = as.factor(format(Date_format, '%a')),
         distance = str_extract_all(race_info,"\\(\\d+ METRES\\)"),
         track_condition = as.factor(as.character(str_extract_all(character,"Track Condition: .* \\d+ T"))),
         distance = as.numeric(str_replace_all(distance, c("\\(" = "", "METRES\\)" = ""))),
         winning_time = gsub("Time: ","",str_extract_all(character,"Time: \\d+:\\d+.\\d+ "))
  )
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
#saveRDS(trial_results_combined, glue("C:/Users/owenl/Documents/Owen/R_git/Horse_racing_analysis/Trial_results_last_month_{Sys.Date()}.rds"))

#both trial and race data contain the same columns, so an analysis can be run on both combined if required

all_race_data <- rbind(race_results_combined, trial_results_combined)

kable(head(all_race_data))
Finish No. Horse Trainer Jockey Margin Bar. Weight Starting Price character race_info info odds Date_format State Racecourse Race Day_of_week distance track_condition winning_time
1 10 SWEET AS SCANDI Kevin Lamprecht Wayne Kerford 8 55.0 $21 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 21.0 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34
2 1 COLHOUN Philip Cole Ms Tessa Townsend (a3) 0.1L 3 60.0 $3.40F Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 3.4 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34
3 5 CITY JET Trevor Montgomerie Ms Sonja Wiseman 2.1L 2 56.5 $4.40 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 4.4 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34
4 9 CELTIC BELLE Lisa Whittle Paul Denton 2.5L 5 55.0 $3.50 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 3.5 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34
5 4 FUTURE DREAMS Terry Gillett Ianish Luximon (a3) 5.4L 9 57.5 $11 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 11.0 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34
6 2 KING CRAB Barry Cooke Ms Jessie Philpot 7.1L 4 59.5 $14 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 14.0 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34
# creating time function to approximate a finish time for all horses, not just winning horses
time_function <- function(dataset){
dataset$winning_time_sec<- as.numeric(gsub(":","",str_extract(dataset$winning_time,":\\d+.\\d+")))

dataset$winning_time_min <- as.numeric(str_extract(dataset$winning_time,"\\d+"))*60
dataset$winning_time_total <- dataset$winning_time_min+dataset$winning_time_sec
dataset$Margin <- gsub("L","",dataset$Margin) 
dataset$Margin_Metres <- as.numeric(dataset$Margin)*2.5
dataset$winner_MperSec <- dataset$distance/dataset$winning_time_total

dataset$time <- dataset$winning_time_total+
  ifelse(is.na(dataset$Margin_Metres/dataset$winner_MperSec),
         0,
         dataset$Margin_Metres/dataset$winner_MperSec)
return(dataset)
}

all_race_data <- time_function(all_race_data)

kable(head(all_race_data))
Finish No. Horse Trainer Jockey Margin Bar. Weight Starting Price character race_info info odds Date_format State Racecourse Race Day_of_week distance track_condition winning_time winning_time_sec winning_time_min winning_time_total Margin_Metres winner_MperSec time
1 10 SWEET AS SCANDI Kevin Lamprecht Wayne Kerford 8 55.0 $21 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 21.0 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34 10.34 60 70.34 NA 17.05999 70.34000
2 1 COLHOUN Philip Cole Ms Tessa Townsend (a3) 0.1 3 60.0 $3.40F Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 3.4 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34 10.34 60 70.34 0.25 17.05999 70.35465
3 5 CITY JET Trevor Montgomerie Ms Sonja Wiseman 2.1 2 56.5 $4.40 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 4.4 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34 10.34 60 70.34 5.25 17.05999 70.64774
4 9 CELTIC BELLE Lisa Whittle Paul Denton 2.5 5 55.0 $3.50 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 3.5 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34 10.34 60 70.34 6.25 17.05999 70.70635
5 4 FUTURE DREAMS Terry Gillett Ianish Luximon (a3) 5.4 9 57.5 $11 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 11.0 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34 10.34 60 70.34 13.50 17.05999 71.13133
6 2 KING CRAB Barry Cooke Ms Jessie Philpot 7.1 4 59.5 $14 Of $17,000.1st $11,050, 2nd $3,400, 3rd $2,550Prize money contribution of 1% will be directed to equine welfare prior to distribution Starter Subsidy $600. 0 - 58, Handicap, Minimum Weight 55kg, Apprentices can claim.Track Name: Main Track Type: Sand Track Condition: Good Time: 1:10.34 Last 600m: 0:00.00 Timing Method: ManualOfficial Comments: Late Scratching (Road Storm). No deductions as no bookmakers on course. Race 1 - 2:40PM Darwin Horse Floats Handicap (1200 METRES)
Times displayed in local time of Race Meeting 2022Mar20,NT,Ladbrokes Pioneer Park 14.0 2022-03-20 NT Ladbrokes Pioneer Park Race 1 Sun 1200 character(0) 1:10.34 10.34 60 70.34 17.75 17.05999 71.38045
all_race_data_clean <- all_race_data %>%
  select(-winning_time,
         -winning_time_sec,
         -winning_time_min,
         -winning_time_total,
         -winner_MperSec) %>%
  mutate(RaceID = paste(info,",",Race),
         ID = paste(info,",",Race,"Finish-", Finish),
         Margin_Metres = ifelse(is.na(Margin_Metres)==T, 0, Margin_Metres)) %>%
  filter(time >0,
         is.na(Finish)!=T)

ggplot(all_race_data_clean) +
  geom_point(aes(x=distance,y=time, colour = Finish))

plot_ly(data = all_race_data_clean, x= ~distance, y=~time, z=~Weight, color = ~Finish)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: Ignoring 3166 observations